#!/usr/bin/python
# -*- coding: utf-8 -*-

import struct
import sys
import binascii
import tkinter as tk
from tkinter import filedialog, messagebox
from tkinter.ttk import Progressbar
import os

# Offset of the pinyin table (start of the slice handed to getPyTable)
startPy = 0x1540

# Offset of the Chinese phrase table (start of the slice handed to getChinese)
startChinese = 0x2628

# Global pinyin table: maps each index read by getPyTable to its pinyin string
GPy_Table = {}

# Parse results:
# a list of (frequency, pinyin, Chinese phrase) tuples filled by getChinese
GTable = []

def byte2str(data):
    """Convert raw bytes holding 16-bit code units into a string.

    Every 2-byte unit is read as a little-endian unsigned short and turned
    into a character with ``chr``. Carriage returns are replaced by newlines
    and ASCII spaces are dropped; all other units (NUL included) are kept.

    Args:
        data: bytes-like object containing the raw 16-bit units.

    Returns:
        The decoded string. A dangling odd byte at the end of ``data`` is
        ignored instead of raising ``struct.error``.
    """
    chars = []
    # Plain 'H' would use the host's native byte order; '<H' pins the
    # little-endian layout the previous code implicitly relied on, so the
    # result no longer depends on the machine running the script.
    for offset in range(0, len(data) - 1, 2):
        ch = chr(struct.unpack_from('<H', data, offset)[0])
        if ch == '\r':
            chars.append('\n')
        elif ch != ' ':
            chars.append(ch)
    # Join once at the end rather than growing a string with += in the loop.
    return ''.join(chars)

# 获取拼音表
def getPyTable(data):
    """Fill the global ``GPy_Table`` from the raw pinyin table bytes.

    After a 4-byte marker the table is a sequence of records, each made of a
    16-bit index, a 16-bit byte length and that many bytes of text decoded
    with ``byte2str``.

    Args:
        data: bytes of the pinyin table, starting at its 4-byte marker.

    Returns:
        Always ``None``. When the marker does not match, nothing is parsed
        and ``GPy_Table`` is left untouched.

    Raises:
        struct.error: if a record header is truncated.
    """
    if data[0:4] != b"\x9D\x01\x00\x00":
        return None

    pos = 4  # skip the marker checked above
    length = len(data)
    while pos < length:
        # '<' fixes the byte order to little-endian instead of the host's
        # native order, so parsing does not depend on the machine.
        index, size = struct.unpack_from('<HH', data, pos)
        pos += 4
        GPy_Table[index] = byte2str(data[pos:pos + size])
        pos += size
    return None

# 获取一个词组的拼音
def getWordPy(data):
    """Return the pinyin of one phrase from its pinyin-index bytes.

    ``data`` is read as consecutive 16-bit little-endian indexes; each one is
    looked up in the global ``GPy_Table`` and the results are concatenated
    without a separator.

    Args:
        data: bytes holding the 16-bit pinyin indexes of a phrase.

    Returns:
        The concatenated pinyin string (empty for empty input).

    Raises:
        KeyError: if an index is missing from ``GPy_Table``.
        struct.error: if ``data`` ends with a dangling odd byte.
    """
    syllables = []
    for offset in range(0, len(data), 2):
        # '<H' pins little-endian instead of relying on native byte order.
        index = struct.unpack_from('<H', data, offset)[0]
        syllables.append(GPy_Table[index])
    return ''.join(syllables)

# 读取中文表
def getChinese(data):
    """Parse the Chinese phrase table and append its entries to ``GTable``.

    The table is a sequence of groups. Each group starts with the number of
    phrases sharing one pinyin and the byte length of the pinyin index list,
    followed by the index list itself and then that many phrase records.
    A phrase record is: 16-bit phrase byte length, phrase bytes, 16-bit
    extension length, extension bytes. The first 16 bits of the extension
    are stored as the phrase frequency.

    Every parsed phrase is appended to the global ``GTable`` as a
    ``(frequency, pinyin, phrase)`` tuple.

    Args:
        data: bytes of the phrase table.

    Raises:
        struct.error: if the data is truncated in the middle of a field.
        KeyError: propagated from ``getWordPy`` for an unknown pinyin index.
    """
    pos = 0
    length = len(data)
    while pos < length:
        # Group header: homophone count and pinyin index list length.
        # '<' pins little-endian rather than the host's native byte order.
        same, py_table_len = struct.unpack_from('<HH', data, pos)
        pos += 4

        # Pinyin index list shared by every phrase in this group.
        py = getWordPy(data[pos:pos + py_table_len])
        pos += py_table_len

        for _ in range(same):
            # Phrase byte length, then the phrase itself.
            c_len = struct.unpack_from('<H', data, pos)[0]
            pos += 2
            word = byte2str(data[pos:pos + c_len])
            pos += c_len

            # Extension length, immediately followed by the extension data
            # whose first 16 bits are the frequency.
            ext_len, count = struct.unpack_from('<HH', data, pos)
            pos += 2

            GTable.append((count, py, word))

            # Skip the whole extension to reach the next phrase record.
            pos += ext_len

def deal(file_name):
    """Parse one .scel file into the module-level ``GTable``.

    The globals ``GTable`` and ``GPy_Table`` are reset on every call, so the
    results of a previous call are discarded.

    Args:
        file_name: path of the file to read.

    Returns:
        ``(True, dict_info)`` on success, where ``dict_info`` maps four
        header field labels to the text decoded from fixed byte ranges of
        the file; ``(False, message)`` if anything goes wrong. No exception
        escapes this function.
    """
    global GTable, GPy_Table
    GTable = []
    GPy_Table = {}

    try:
        with open(file_name, 'rb') as f:
            data = f.read()

        if data[0:12] != b"\x40\x15\x00\x00\x44\x43\x53\x01\x01\x00\x00\x00":
            raise ValueError("不是搜狗词库文件格式(.scel)")

        # Header text fields stored at fixed offsets.
        dict_info = {
            "词库名": byte2str(data[0x130:0x338]),
            "词库类型": byte2str(data[0x338:0x540]),
            "描述信息": byte2str(data[0x540:0xd40]),
            "词库示例": byte2str(data[0xd40:startPy])
        }

        getPyTable(data[startPy:startChinese])
        # getPyTable returns None both on success and when the table marker
        # does not match, so an empty table is the only visible failure
        # signal. Report it here instead of letting the phrase parser fail
        # later with a bare KeyError.
        if not GPy_Table:
            raise ValueError("拼音表格式不正确，无法解析词库")

        getChinese(data[startChinese:])
        return True, dict_info
    except KeyError as e:
        # str(KeyError) is just the missing key, which is meaningless to the
        # user on its own; say what the number is.
        return False, f"未知的拼音索引：{e}"
    except Exception as e:
        return False, str(e)

class ScelConverterGUI:
    """Tkinter front end for converting .scel dictionaries to TXT and CIN.

    Parsing is delegated to the module-level ``deal`` function, whose results
    are read back from the global ``GTable``. CIN output is delegated to the
    ``txt2cin`` module, which is imported lazily inside the methods that
    need it.
    """

    def __init__(self):
        """Build the main window and all of its widgets."""
        self.window = tk.Tk()
        self.window.title("搜狗词库转换工具")
        self.window.geometry("600x400")

        # Main container
        self.main_frame = tk.Frame(self.window, padx=20, pady=20)
        self.main_frame.pack(fill=tk.BOTH, expand=True)

        # File selection row
        self.file_frame = tk.Frame(self.main_frame)
        self.file_frame.pack(fill=tk.X, pady=(0, 20))

        self.file_label = tk.Label(self.file_frame, text="选择词库文件：")
        self.file_label.pack(side=tk.LEFT)

        self.file_path = tk.StringVar()
        self.file_entry = tk.Entry(self.file_frame, textvariable=self.file_path, width=50)
        self.file_entry.pack(side=tk.LEFT, padx=(10, 10))

        self.browse_button = tk.Button(self.file_frame, text="浏览", command=self.browse_file)
        self.browse_button.pack(side=tk.LEFT)

        # Information / progress output
        self.info_text = tk.Text(self.main_frame, height=10, width=60)
        self.info_text.pack(fill=tk.BOTH, expand=True)

        # Action buttons, packed right-to-left
        self.button_frame = tk.Frame(self.main_frame)
        self.button_frame.pack(fill=tk.X, pady=(20, 0))

        self.convert_button = tk.Button(self.button_frame, text="转换SCEL", command=self.convert_file)
        self.convert_button.pack(side=tk.RIGHT)

        self.batch_convert_button = tk.Button(self.button_frame, text="批量转换并合并", command=self.batch_convert_and_merge)
        self.batch_convert_button.pack(side=tk.RIGHT, padx=(0, 10))

        self.convert_scel_to_cin_button = tk.Button(self.button_frame, text="直接转换为CIN", command=self.convert_scel_to_cin)
        self.convert_scel_to_cin_button.pack(side=tk.RIGHT, padx=(0, 10))

        self.merge_cin_button = tk.Button(self.button_frame, text="合并CIN", command=self.merge_cin_files)
        self.merge_cin_button.pack(side=tk.RIGHT, padx=(0, 10))

        self.convert_cin_button = tk.Button(self.button_frame, text="TXT转CIN", command=self.convert_to_cin)
        self.convert_cin_button.pack(side=tk.RIGHT, padx=(0, 10))

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _show_info(self, text):
        """Replace the whole content of the info box with ``text``."""
        self.info_text.delete("1.0", tk.END)
        self.info_text.insert(tk.END, text)

    @staticmethod
    def _txt_line(count, py, word):
        """Format one entry as ``{count}pinyin word`` plus a newline.

        Both TXT writers use this helper so the file saved by
        ``convert_file`` has the same layout as the intermediate file that
        ``convert_scel_to_cin`` feeds to ``txt2cin``.
        """
        return f"{{{count}}}{py} {word}\n"

    @staticmethod
    def _write_cin(save_path, entries):
        """Write the ``txt2cin`` header followed by ``entries`` to ``save_path``."""
        # Import before opening: a missing module must not leave a freshly
        # truncated output file behind.
        import txt2cin

        with open(save_path, 'w', encoding='utf-8') as f:
            txt2cin.write_cin_header(f)
            f.writelines(entry + '\n' for entry in entries)

    # ------------------------------------------------------------------
    # Button callbacks
    # ------------------------------------------------------------------

    def browse_file(self):
        """Let the user pick a file and store its path in the entry field."""
        file_path = filedialog.askopenfilename(
            title="选择搜狗词库文件",
            filetypes=[("Scel Files", "*.scel"), ("All Files", "*.*")]
        )
        if file_path:
            self.file_path.set(file_path)

    def convert_file(self):
        """Parse the selected .scel file and save its entries as a TXT file."""
        file_path = self.file_path.get()
        if not file_path:
            messagebox.showerror("错误", "请先选择一个词库文件！")
            return

        self._show_info("正在转换词库...")
        self.window.update()

        success, result = deal(file_path)
        if not success:
            messagebox.showerror("错误", f"转换失败：{result}")
            return

        # Show the dictionary header fields returned by deal().
        self.info_text.delete("1.0", tk.END)
        for key, value in result.items():
            self.info_text.insert(tk.END, f"{key}：{value}\n")

        save_path = filedialog.asksaveasfilename(
            title="保存转换结果",
            defaultextension=".txt",
            filetypes=[("Text Files", "*.txt"), ("All Files", "*.*")],
            initialfile=os.path.splitext(os.path.basename(file_path))[0] + ".txt"
        )
        if not save_path:
            return

        try:
            with open(save_path, 'w', encoding='utf-8') as f:
                # Previously written as f"{count}{py} ...", which glued the
                # frequency straight onto the pinyin; use the braced form
                # shared with convert_scel_to_cin.
                f.writelines(
                    self._txt_line(count, py, word) for count, py, word in GTable
                )
            messagebox.showinfo("成功", f"转换完成！\n共转换{len(GTable)}个词条\n已保存到：{save_path}")
        except Exception as e:
            messagebox.showerror("错误", f"保存文件失败：{str(e)}")

    def convert_to_cin(self):
        """Convert the selected TXT file to a CIN file next to it."""
        file_path = self.file_path.get()
        if not file_path:
            messagebox.showerror("错误", "请先选择一个文件！")
            return

        # Compare case-insensitively so e.g. "WORDS.TXT" is accepted too.
        if not file_path.lower().endswith('.txt'):
            messagebox.showerror("错误", "请选择txt格式的文件！")
            return

        self._show_info("正在转换为CIN格式...")
        self.window.update()

        cin_file = os.path.splitext(file_path)[0] + ".cin"

        # Report failures in a dialog like the other callbacks do, instead
        # of letting the exception escape into the Tk callback machinery.
        try:
            import txt2cin
            converted = txt2cin.convert_txt_to_cin(file_path, cin_file)
        except Exception as e:
            messagebox.showerror("错误", f"转换失败：{str(e)}")
            return

        if converted:
            self._show_info(f"转换完成！\n文件已保存到：{cin_file}")
        else:
            messagebox.showerror("错误", "转换失败！")

    def convert_scel_to_cin(self):
        """Convert the selected .scel file straight to a CIN file next to it."""
        file_path = self.file_path.get()
        if not file_path:
            messagebox.showerror("错误", "请先选择一个词库文件！")
            return

        # Compare case-insensitively so e.g. "DICT.SCEL" is accepted too.
        if not file_path.lower().endswith('.scel'):
            messagebox.showerror("错误", "请选择scel格式的文件！")
            return

        self._show_info("正在转换为CIN格式...")
        self.window.update()

        success, result = deal(file_path)
        if not success:
            messagebox.showerror("错误", f"转换失败：{result}")
            return

        base = os.path.splitext(file_path)[0]
        cin_file = base + ".cin"
        temp_txt = None

        try:
            import tempfile
            import txt2cin

            # Use a uniquely named temp file in the source directory. The
            # old fixed "<name>_temp.txt" path would overwrite and then
            # delete a user file that happened to have that name.
            fd, temp_txt = tempfile.mkstemp(
                prefix=os.path.basename(base) + "_temp",
                suffix=".txt",
                dir=os.path.dirname(os.path.abspath(file_path)),
            )
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                f.writelines(
                    self._txt_line(count, py, word) for count, py, word in GTable
                )

            if txt2cin.convert_txt_to_cin(temp_txt, cin_file):
                self._show_info(f"转换完成！\n共转换{len(GTable)}个词条\n已保存到：{cin_file}")
            else:
                messagebox.showerror("错误", "转换失败！")

        except Exception as e:
            messagebox.showerror("错误", f"转换失败：{str(e)}")
        finally:
            # Always remove the intermediate TXT file.
            if temp_txt is not None and os.path.exists(temp_txt):
                os.remove(temp_txt)

    def batch_convert_and_merge(self):
        """Convert several .scel files and merge their entries into one CIN file.

        Entries are written as ``pinyin word`` lines, de-duplicated and
        sorted. Files that fail to parse are skipped and listed in the final
        summary.
        """
        file_paths = filedialog.askopenfilenames(
            title="选择搜狗词库文件",
            filetypes=[("Scel Files", "*.scel"), ("All Files", "*.*")]
        )
        if not file_paths:
            return

        save_path = filedialog.asksaveasfilename(
            title="保存合并后的文件",
            defaultextension=".cin",
            filetypes=[("CIN Files", "*.cin"), ("All Files", "*.*")]
        )
        if not save_path:
            return

        try:
            all_entries = set()  # a set removes duplicates automatically
            total_entries = 0
            failures = []

            self.info_text.delete("1.0", tk.END)
            self.window.update()

            for file_path in file_paths:
                name = os.path.basename(file_path)
                self.info_text.insert(tk.END, f"正在处理：{name}\n")
                self.window.update()

                success, result = deal(file_path)
                if not success:
                    self.info_text.insert(tk.END, f"转换失败：{result}\n")
                    self.window.update()
                    failures.append(f"{name}：{result}")
                    continue

                all_entries.update(f"{py} {word}" for _count, py, word in GTable)
                total_entries += len(GTable)

            if not all_entries:
                messagebox.showerror("错误", "没有成功转换任何词条！")
                return

            sorted_entries = sorted(all_entries)
            self._write_cin(save_path, sorted_entries)

            summary = f"转换完成！\n原始词条数：{total_entries}\n去重后词条数：{len(sorted_entries)}\n已保存到：{save_path}"
            if failures:
                # The progress log is cleared below, so repeat the failed
                # files in the summary or the user would never see them.
                summary += "\n以下文件转换失败：\n" + "\n".join(failures)
            self._show_info(summary)

        except Exception as e:
            messagebox.showerror("错误", f"转换失败：{str(e)}")

    def merge_cin_files(self):
        """Merge the entries of several CIN files into one de-duplicated file.

        In each input file, only lines after the first line starting with
        ``%keyname end`` that do not themselves start with ``%`` are treated
        as entries.
        """
        file_paths = filedialog.askopenfilenames(
            title="选择要合并的CIN文件",
            filetypes=[("CIN Files", "*.cin"), ("All Files", "*.*")]
        )
        if not file_paths:
            return

        save_path = filedialog.asksaveasfilename(
            title="保存合并后的文件",
            defaultextension=".cin",
            filetypes=[("CIN Files", "*.cin"), ("All Files", "*.*")]
        )
        if not save_path:
            return

        try:
            all_entries = set()

            for file_path in file_paths:
                with open(file_path, 'r', encoding='utf-8') as f:
                    in_entries = False
                    for raw_line in f:
                        line = raw_line.strip()
                        if not line:
                            continue
                        if line.startswith('%keyname end'):
                            # Everything before this marker is header.
                            in_entries = True
                            continue
                        if in_entries and not line.startswith('%'):
                            all_entries.add(line)

            sorted_entries = sorted(all_entries)
            self._write_cin(save_path, sorted_entries)

            self._show_info(f"合并完成！\n共合并{len(sorted_entries)}个词条\n已保存到：{save_path}")

        except Exception as e:
            messagebox.showerror("错误", f"合并失败：{str(e)}")

    def run(self):
        """Enter the Tk main loop; returns when the window is closed."""
        self.window.mainloop()

if __name__ == '__main__':
    # Script entry point: build the converter window and enter its main loop.
    app = ScelConverterGUI()
    app.run()